## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [conflicted] Will prefer dplyr::filter over any other package.
## [conflicted] Will prefer dplyr::select over any other package.
## [conflicted] Will prefer dplyr::slice over any other package.
## [conflicted] Will prefer dplyr::rename over any other package.
## [conflicted] Will prefer dplyr::intersect over any other package.
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1560 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 32 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (6): nr_detected, nr_expected, sensitivity, perc_compound_val, total_n, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ENST
## [1] 17461
# NA or ambiguous => strand needs to be taken into account
all_circ %>% select(circ_id_strand, ENST) %>% unique() %>%
filter(is.na(ENST) | ENST == 'ambiguous') %>%
count(ENST)# total nr
# ensembl = read_tsv('/Users/mivromma/Documents/PhD/indexes/Homo_sapiens.GRCh38.103.gtf',
# col_names = c('chr', 'havana', 'type', 'start', 'end', 'dot', 'strand', 'dot2', 'info'))
#
# ensembl %>% filter(type == "transcript") %>%
# separate(info, into = c('gene_id', 'gene_version', 'transcrip_id', 'rest'),
# extra = 'merge', sep = ';') %>%
# select(transcrip_id) %>% unique()
17461/234393## [1] 0.07449455
# can = read_tsv('known_exons_GRCh38.103_canonical.bed', col_names = c('chr', 'start', 'end', 'exon', 'dot', 'strand'))
#
# can %>% separate(exon, into = c('ENST', 'ENST_nr', 'exon', 'exon_nr'), sep = '_') %>%
# select(ENST) %>% unique() %>%
# inner_join(all_circ %>% select(ENST) %>% unique())
17461/60427## [1] 0.2889602
with introns
all_circ %>% select(chr, start, end, estim_len_in) %>% unique() %>%
pull(estim_len_in) %>% quantile()## 0% 25% 50% 75% 100%
## 31 433 1428 8391 999911
no introns
all_circ %>% select(chr, start, end, estim_len_ex) %>% unique() %>%
filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
pull(estim_len_ex) %>% quantile()## 0% 25% 50% 75% 100%
## 1 275 485 900 37221
do not get validated (no introns)
cq %>% filter(qPCR_val == 'pass',
RR_val == 'fail',
amp_seq_val == 'pass') %>%
select(chr, start, end, estim_len_ex) %>% unique() %>%
filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
pull(estim_len_ex) %>% quantile()## 0% 25% 50% 75% 100%
## 115.00 2109.75 2381.50 3483.00 4758.00
## 0% 25% 50% 75% 100%
## 1 1 1 3 4007
how many with BSJ < 5
## [1] 0.866
how many with BSJ >= 2
## [1] 503237
## [1] 0.461